Data from Kaggle: https://www.kaggle.com/parulpandey/palmer-archipelago-antarctica-penguin-data
Analysis Helpers:
- Python: https://www.kaggle.com/parulpandey/penguin-dataset-the-new-iris/notebook
- Python: https://www.kaggle.com/amandawest/penguin-dataset-the-new-iris/edit
- R: https://github.com/allisonhorst/palmerpenguins
# Load the penguin measurements from CSV.
# The file is read through an explicit path instead of calling setwd(), and the
# workspace is not wiped with rm(list = ls()), so running this script leaves
# the session's working directory and existing objects untouched.
data_dir <- "~/Desktop/r_notes" # folder that holds penguins_size.csv
penguins <- read.csv(file.path(data_dir, "penguins_size.csv"), header = TRUE)
# Copy of the data keeping only rows that have no missing value in any column.
penguins_no_nas <- na.omit(penguins)
# relevant libraries
library(dplyr) # so we can use the pipe operator in part 2
library(plotly) # data visualization in part 3
# Per-species mean of every numeric column, ignoring missing values.
# mean() is wrapped in an anonymous function because supplying extra arguments
# (here na.rm) through across()'s `...` is deprecated as of dplyr 1.1.0; the
# resulting values and column names are the same.
penguins %>%
  group_by(species) %>%
  summarize(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))
## # A tibble: 3 x 5
## species culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3701.
## 2 Chinstrap 48.8 18.4 196. 3733.
## 3 Gentoo 47.5 15.0 217. 5076.
Next is the correlation matrix, which gives some insight into how our scatter plots will look:
# Correlation matrix (cor() default method) of three body measurements.
# Columns are picked by name rather than by position so the selection does not
# silently change if the CSV's column order does.
measure_cols <- c("culmen_length_mm", "flipper_length_mm", "body_mass_g")
# Drop rows with a missing value in any of the selected measurements.
penguins_cor <- na.omit(penguins[, measure_cols])
res <- cor(penguins_cor)
round(res, 2) # print rounded to two decimals
## culmen_length_mm flipper_length_mm body_mass_g
## culmen_length_mm 1.00 0.66 0.60
## flipper_length_mm 0.66 1.00 0.87
## body_mass_g 0.60 0.87 1.00
# Scatter plot of body mass against flipper length; marker colour and marker
# size are both mapped to flipper length.
# The trace type and mode are stated explicitly so plotly does not have to
# guess them (and print a "No trace type specified" message) on every render.
plot_ly(
  penguins,
  x = ~flipper_length_mm,
  y = ~body_mass_g,
  color = ~flipper_length_mm,
  size = ~flipper_length_mm,
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    xaxis = list(title = "Flipper Length (mm)"),
    yaxis = list(title = "Body Mass (g)")
  )
# Scatter plot of body mass against flipper length, with marker colour mapped
# to species and marker size mapped to flipper length.
# The trace type and mode are stated explicitly so plotly does not have to
# guess them (and print a "No trace type specified" message) on every render.
plot_ly(
  penguins,
  x = ~flipper_length_mm,
  y = ~body_mass_g,
  color = ~species,
  size = ~flipper_length_mm,
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    xaxis = list(title = "Flipper Length (mm)"),
    yaxis = list(title = "Body Mass (g)")
  )
Finis!